The trend of population ageing and child birth in U.S.

Takuya Ando

In [0]:
#@title Hide code
!pip install geopandas
!apt-get -qq install python-cartopy python3-cartopy 
!pip uninstall shapely
!pip install shapely --no-binary shapely
import cartopy 
!pip3 install geoplot 
import statsmodels.iolib.foreign as smio 
from pandas import DataFrame 
import pandas as pd 
import geopandas as gpd
import altair as alt 
import vega_datasets as data 
import json 
import warnings 
import numpy as np 
import matplotlib.pyplot as plt 
import geoplot as gplt 
import geoplot.crs as gcrs 
warnings.filterwarnings('ignore')
In [2]:
#@Hide code
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [3]:
#@Hide code
cd "/content/drive/My Drive/Datavisualization"
/content/drive/My Drive/Datavisualization

Population aging trend in US

In [0]:
urls = ["https://docs.google.com/spreadsheets/d/1sUA47Q8OrH18YI4q_DOP7OWIoHX2i8kH0fk5E4lFszE/export?format=csv&gid=1048618207",
        "https://docs.google.com/spreadsheets/d/18KT-PC9IXKqYL-wd5zHHzYV4K8eZ9mgUe98XaFANMQM/export?format=csv&gid=2072191966", 
        "https://docs.google.com/spreadsheets/d/1p_fq_sG3nYnHdOkLVglxXF2zmT1XrPK0c-538lcFIAg/export?format=csv&gid=1688370404",
        "https://docs.google.com/spreadsheets/d/1ymCdsBHgamJis3_2PK0IZS_VZvls5Umn-ng9o4S0Sro/export?format=csv&gid=119942101",
        "https://docs.google.com/spreadsheets/d/1AqUeUfzuAUG5pCLJGbrysCcTxICCrHsA8rRE1ffHZpU/export?format=csv&gid=1384942932",
        "https://docs.google.com/spreadsheets/d/1zvfu9Nc27aGo2ovAn2kqofv-iVXO61-n-dQ4DIrpbUs/export?format=csv&gid=2105631292",
        "https://docs.google.com/spreadsheets/d/1XAXIQeLrgxgEx6CCHxWOmfgmInRxwCxu-4vw1PR8dp8/export?format=csv&gid=1064535711",
        "https://docs.google.com/spreadsheets/d/1xad1XOm-J4T13rYFJ7BBeskaaCV88BAQZExqNX3aX0w/export?format=csv&gid=95139586",
        "https://docs.google.com/spreadsheets/d/1pBj5q5NkzpZUKeOqFZCGb0RdzV1mvS8rw2iXWcwYKbc/export?format=csv&gid=616415653",
        "https://docs.google.com/spreadsheets/d/1dZBbXEKSRqhR8Yy9_zFBAjzdw_Gyu3czYT0GNcGhrZI/export?format=csv&gid=687388009",
        "https://docs.google.com/spreadsheets/d/1pReietRze4QKpMOM6dbJeJ4bdekcGgrxcaC5k4Qv8C8/export?format=csv&gid=2108461693",
        "https://docs.google.com/spreadsheets/d/1KkqQ8U4vL5Tjy7Cq16FQD7xi3qnfPnyYiMYQuiJ6Hkc/export?format=csv&gid=957537605"]

dfs =[]

for url in urls:
  df = pd.read_csv(url, encoding='latin-1')[1:]
  dfs.append(df)

pop65_conc = pd.concat(dfs)

col_use = ["date", "HC01_EST_VC01", "HC02_EST_VC01"]

pop65_conc = pop65_conc[col_use]

pop65_conc["HC01_EST_VC01"]=pop65_conc["HC01_EST_VC01"].astype("int")
pop65_conc["HC02_EST_VC01"]=pop65_conc["HC02_EST_VC01"].astype("int")

pop65_conc["pop65_rate"] = pop65_conc["HC02_EST_VC01"]/pop65_conc["HC01_EST_VC01"]

pop65_conc["date"] =pd.to_datetime(pop65_conc["date"])
pop65_conc["year"] = pop65_conc["date"].dt.year
pop65_conc = pop65_conc.drop("date", axis=1)
pd.set_option('display.max_rows', 10000)
pop65_conc["year"] = pop65_conc["year"].astype("str") 
In [164]:
step = 30
overlap = 0.5

alt.themes.enable('latimes')

chart = alt.Chart(pop65_conc, height=step, width=700).transform_joinaggregate(
    mean_rate='mean(pop65_rate)', groupby=['year']
).transform_bin(
   ['bin_max', 'bin_min'], field='pop65_rate', bin=alt.BinParams(step=0.03)
).transform_aggregate(
    value='count()', groupby=['year','mean_rate', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['year','mean_rate'], key='bin_min', value=0
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='lightgray',
    strokeWidth=0.5
).encode(
    alt.X('bin_min:Q', bin='binned', title='population rate of over 65',
        axis=alt.Axis(title='Population rate of over age 65', ),
        scale=alt.Scale(domain=(0.05, 0.30))),
    alt.Y(
        'value:Q',
        scale=alt.Scale(range=[step, -step * overlap]),
        axis= None 
    ),
    alt.Fill(
        'mean_rate:Q',
        legend=alt.Legend(title="Population rate of over age65"),
        scale=alt.Scale(domain=[0.16, 0.11], scheme='inferno')
    )
).facet(
    row = alt.Row(
        'year:N',
        title="year",
        header=alt.Header(titleFontSize=13, labelAngle=0, labelFontSize=12, labelAlign="left")
    )
).properties(
    title={
      "text": ["US Population has been ageing clearly in last 10 years"], 
      "subtitle": ["Distribution of over65 population rate for each county"],
      "color": "blue",
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
    },
    bounds='flush'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).configure_title(
    anchor='end'
)

config = chart.configure_title(
    fontSize=24,
    fontWeight="bold",
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=13,
    titleFontWeight="bold",
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    titleFontWeight="bold",
    labelFontSize=11,
    grid=True,
    gridDash=[2,3]
)

config
Out[164]:
Source: American community survey

The ridgeline plot above show the trend of distribution of over age 65 population from 2006 to 2017 for county level. We can see that, in earlier period, the proportion of people over age of 65 was more concentrated around the mean of about 13 % and there were very limited number of counties which recorded over 20 %. Then, in recent years, we can see that the tail becomes longer and the proportion is distributed toward higher rate, and there are some part of counties which record over 24 % of ageing population.

In [0]:
sc00_18 = pd.read_csv("sc00_18.csv")

sc00_18_wrkag = sc00_18[(sc00_18["AGE"]>=15) & (sc00_18["AGE"]<=64)]

sc00_18_ov65 = sc00_18[(sc00_18["AGE"]>=65) & (sc00_18["AGE"]!=999)]

sc00_18_wrkag_div = sc00_18_wrkag.groupby(["DIVISION"]).sum()

sc00_18_ov65_div = sc00_18_ov65.groupby(["DIVISION"]).sum()

sc00_18_wrkag_divt =sc00_18_wrkag_div.T[8:]

div_dic = {0:"USA", 1:"New England", 2:"Mid-Atlantic", 3:"East North Central", 4:"West North Central", \
       5:"South Atlantic", 6:"East South Central", 7:"West South Central", 8:"Mountain", 9:"Pacific"}

div_list = []
for i in range(0, 10):
    sc00_18_wrkag_divt_col = sc00_18_wrkag_divt[i].reset_index()
    year = sc00_18_wrkag_divt_col["index"].apply(lambda x: x[6:10])
    sc00_18_wrkag_divt_col["index"]=year 
    sc00_18_wrkag_divt_col = sc00_18_wrkag_divt_col.rename(columns={"index":"year", i:"pop"})
    sc00_18_wrkag_divt_col["pct_chg"] = sc00_18_wrkag_divt_col["pop"].pct_change()
    sc00_18_wrkag_divt_col["pct_chg"][0] = 0
    sc00_18_wrkag_divt_col["pct_chg"] = sc00_18_wrkag_divt_col["pct_chg"]*100 +100

    sc00_18_wrkag_divt_col

    sc00_18_wrkag_divt_col["div"] = i
    
    div = sc00_18_wrkag_divt_col["div"].apply(lambda x: div_dic[i])
    
    sc00_18_wrkag_divt_col["div"] = div
    
    div_list.append(sc00_18_wrkag_divt_col)

conc_div = pd.concat(div_list)
In [167]:
conc_div["a"] =100

alt.themes.enable('latimes')

box = pd.DataFrame({'x1': [2000], 'x2': [2018], 'y1': [0], 'y2': [100]})

domain = ["USA", "New England", "Mid-Atlantic","East North Central", "West North Central",\
          "South Atlantic", "East South Central", "West South Central", "Mountain", "Pacific"]

range_ = ['red', 'blue', 'green','orange','pink','light-gray','light-gray','light-gray','light-gray','light-grey']

chart = alt.Chart(conc_div, width=800, height=400).mark_line().encode(
    x=alt.X('year:O',
    axis=alt.Axis(title="year")),
    y = alt.Y('pct_chg:Q',
        scale=alt.Scale(domain=(99, 103),),
        axis=alt.Axis(title="percent_change(indexed to 100)")
    ),
    color=alt.Color('div:N', scale=alt.Scale(domain=domain, range=range_),
                   legend=alt.Legend(title="Division",
                                     values=["USA", "New England", "Mid-Atlantic","East North Central","West North Central",
                                             "(light-grey)Other divisions"]))
).properties(
    title={
      "text": ["Working-age population starts declining in north-east areas"], 
      "subtitle": ["The change rate of working-age population for each division"],
      "color": "blue",
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
    }
)

box = alt.Chart(conc_div).mark_rect(fill='#75bbff', stroke='none',strokeOpacity=0, fillOpacity=0.03).encode(
    alt.X('year:N'),
    alt.Y('a')
)

rule = alt.Chart(conc_div).mark_rule(color='black', strokeDash=[2,3], size=2).encode(
    y='a:Q'
)

config = (chart+box+rule).configure_title(
    fontSize=24,
    fontWeight="bold",
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=12,
    titleFontWeight="bold",
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    labelFontSize=10,
    labelAngle=-45,
    grid=True,
    gridDash=[2,3],
    titleFontWeight="bold"
).configure_axisY(
    labelFontSize=11,
    grid=True,
    gridDash=[2,3],
    titleFontWeight="bold"
    )

config
Out[167]:
Source: American community survey

The line graph above shows how the working-age population(15-64 years old) has been changing from 2000 to 2018 for different divisions. Overall, the working-age population is declining for most of the divisions. In some divisions the working-age population is even starts to decline. We can see such kind of divisions are mainly in north-eastern part of US.

In [0]:
sc00_18_rate = sc00_18_wrkag_div

for i in range(8, len(sc00_18_rate.columns)):
    sc00_18_rate.iloc[:,i] = np.divide(sc00_18_rate.iloc[:,i], sc00_18_ov65_div.iloc[:, i])

div_dic = {0:"USA", 1:"New England", 2:"Mid-Atlantic", 3:"East North Central", 4:"West North Central", \
       5:"South Atlantic", 6:"East South Central", 7:"West South Central", 8:"Mountain", 9:"Pacific"}

sc00_18_rate2000 = sc00_18_rate.iloc[:, 8].reset_index()
sc00_18_rate2000["DIVISION"] = sc00_18_rate2000["DIVISION"].apply(lambda x: div_dic[x])
sc00_18_rate2000 = sc00_18_rate2000.rename(columns={"DIVISION":"Division", "POPEST2000_CIV": "per_wk"})
sc00_18_rate2000["Division"][2] = "Middle Atlantic"

sc00_18_rate2018 = sc00_18_rate.iloc[:,26].reset_index()
sc00_18_rate2018["DIVISION"] = sc00_18_rate2018["DIVISION"].apply(lambda x: div_dic[x])
sc00_18_rate2018 = sc00_18_rate2018.rename(columns={"DIVISION":"Division", "POPEST2018_CIV": "per_wk"})
sc00_18_rate2018["Division"][2] = "Middle Atlantic"

sc00_18_rate2000_div = sc00_18_rate2000[1:]
sc00_18_rate2018_div = sc00_18_rate2018[1:]

div = gpd.read_file("cb_2018_us_division_500k.shp")
col_use=["NAME","geometry"]
div = div[col_use]
div = div.rename(columns={"NAME":"Division"}) 

gdf1 = pd.merge(div, sc00_18_rate2000_div)

div["lon_lat"]= div.centroid
div2=div.drop(["geometry"], axis=1)
div2 = div2.rename(columns={"lon_lat":"geometry"})
div2["longitude"]= div2.geometry.x
div2["latitude"]= div2.geometry.y

div3 = pd.merge(div2, sc00_18_rate2000_div)
div3["per_wk"] = div3["per_wk"]
div3["longitude"] = div3["longitude"]

div4 = pd.merge(div2, sc00_18_rate2018_div)
div4["per_wk"] = div4["per_wk"]
div4["longitude"] = div4["longitude"]
In [171]:
choro_json = json.loads(gdf1.to_json())
choro_data = alt.Data(values=choro_json['features'])

alt.themes.enable('latimes')

base = alt.Chart(choro_data).mark_geoshape(
        fill="lightgrey",
        stroke='white',
        strokeWidth=1
    ).encode(
    ).project(
    type='albersUsa'
    ).properties(
        title={
      "text": "Lesser younger generation against elderly people", 
      "subtitle": ["The number of 65 over population per working-age person in 2000 and 2018"],
      "color": "blue",
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
    },width=700,
    height=500
    )

points1 = alt.Chart(div3).mark_circle(color="red", fillOpacity=0.8).encode(
    latitude="latitude:Q",
    longitude="longitude:Q",
    size=alt.Size("per_wk:Q",scale=alt.Scale(domain=[3.5, 6.1], range=[0, 3000]),
                  legend=alt.Legend(title="working-age/65over(2000)")
                  ),
    tooltip=["Division:N", "per_wk:Q"]
)

points2 = alt.Chart(div4).mark_circle(color="blue", fillOpacity=0.8).encode(
    latitude="latitude:Q",
    longitude="longitude:Q",
    size=alt.Size("per_wk:Q",scale=alt.Scale(domain=[3.5, 6.1], range=[0, 3000]),
                  legend=alt.Legend(title="working-age/65over(2018)")
),
    tooltip=["Division:N", "per_wk:Q"]
)

points = (points1 + points2).resolve_scale(size='independent')

config = (base + points)

config = config.configure_title(
    fontSize=24,
    fontWeight="bold", 
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=12,
    titleFontWeight="bold",
    symbolType="circle",
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    labelFontSize=10,
    labelAngle=-45
).configure_axisY(
    labelFontSize=10)

config
Out[171]:
Source: Census Bureau, population estimates

Then, we will see how the proportion of eldery people against younger generation has changed in last two decades. The red circles in the plot indicate the number of younger people for one elderly people in working age in 2000 for each division. We can see that for all of regions the rate was more than 4.7. However, in 2018 for all divisions the rate declined, and now the range is 3.7 to 4.8. In the central area the pace seems relatively moderate, but in some areas such as South atlantic and New England the rate declines rapidly, which means fewer younger generations need to support elderly people.

In [0]:
aging = pd.read_csv("https://docs.google.com/spreadsheets/d/1ggeuLBBugr7yyctZ1qM_evCxKdBYqq_LaDRRRmklsCY/export?format=csv&gid=1073389986", encoding='latin-1')[1:]

use_cols = ["YEAR","HC01_EST_VC01","HC02_EST_VC01","HC02_EST_VC09","HC02_EST_VC10","HC02_EST_VC11",\
            "HC02_EST_VC12", "HC02_EST_VC13", "HC02_EST_VC14", "HC02_EST_VC15", "HC02_EST_VC16", "HC02_EST_VC18"]

aging = aging[use_cols]

aging.columns = ["Year", "total_pop", "total_pop_65", "total_pop65_onerace", "White", "Black",\
                 "Native","Asian","hawai","other", "two_or_more", "Hispanic or Latino"]

aging["Year"]= aging["Year"].astype("int")
aging["White"] = aging["White"].astype("float")
aging["Black"] = aging["Black"].astype("float")
aging["Native"] = aging["Native"].astype("float")
aging["Asian"] = aging["Asian"].astype("float")
aging["hawai"] = aging["hawai"].astype("float")
aging["other"] = aging["other"].astype("float")
aging["two_or_more"] = aging["two_or_more"].astype("float")
aging["Hispanic or Latino"] = aging["Hispanic or Latino"].astype("float")

aging["Others"] = aging["hawai"] + aging["other"] + aging["two_or_more"]

aging_nonhis = aging.drop(["hawai", "other", "two_or_more","Hispanic or Latino"], axis=1)

aging_his = aging[["Year", "Hispanic or Latino"]]
aging_his["Race"] = "Hispanic or Latino"
In [0]:
white = aging_nonhis[["Year","White"]]
white["Race"]="White"
white = white.rename(columns={"White":"pop65"})
white = white.sort_values(by=["Year"])

black = aging_nonhis[["Year","Black"]]
black["Race"]="Black"
black = black.rename(columns={"Black":"pop65"})
black = black.sort_values(by=["Year"])

native = aging_nonhis[["Year","Native"]]
native["Race"]="Native"
native = native.rename(columns={"Native":"pop65"})
native = native.sort_values(by=["Year"])

asian = aging_nonhis[["Year","Asian"]]
asian["Race"]="Asian"
asian = asian.rename(columns={"Asian":"pop65"})
asian = asian.sort_values(by=["Year"])

other = aging_nonhis[["Year","Others"]]
other["Race"]="Others"
other = other.rename(columns={"Others":"pop65"})
other = other.sort_values(by=["Year"])

aging_conc_nonhis = pd.concat([white, black, native, asian, other])
In [192]:
col_domain = ["White", "Black", "Native", "Asian", "Others"]

col_range = ["yellow", "blue", "red", "green", "pink"]

chart1 = alt.Chart(aging_conc_nonhis).mark_area(fillOpacity=0.6).encode(
    x="Year:N",
    y=alt.Y("pop65:Q",
    scale=alt.Scale(domain=[0,100]),
    axis=alt.Axis(title="Population of people over65(%)")
    ),
    color=alt.Color("Race:N",
                    scale= alt.Scale(domain=col_domain, range=col_range))
).properties(
    width=500,
    height=400,
    title={
      "text": "Race of elderly people is being gradually diverged",
      "subtitle": ["Racial structure for people over 65 years old in US(2006-2017)"],
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
      }
)

chart2 = alt.Chart(aging_his).mark_area(fillOpacity=0.6).encode(
    x="Year:N",
    y="Hispanic or Latino:Q",
    color=alt.Color("Race:N",
                    scale = alt.Scale(range=["orange"]),
                    legend=alt.Legend(title=None)))


chart = (chart1 + chart2).resolve_scale(color='independent')

config = chart.configure_title(
    fontSize=24,
    color="blue",
    fontWeight="bold",
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=13,
    labelFontSize=11,
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    titleFontSize=13,
    titleFontWeight="bold",
    labelFontSize=11,
    labelAngle=-45,
    grid=True,
    gridDash=[2,3]
).configure_axisY(
    titleFontSize=13,
    titleFontWeight="bold",
    labelFontSize=11)

config
Out[192]:
Source: American community survey

Then, what about racial structure of population over age of 65? The stacked bar chart above shows how the racial structure of population over 65 years old has changed these 10 years. We can see that through entire period the majority of population over 65 years old have been white, still over 80 percent. However, the proportion of white slightly have decreased and the proportion of asian and hispanic have been increased gradually.

Child birth trend

In [193]:
df = pd.read_csv("fertility_data2.csv")

alt.themes.enable('latimes')

base = alt.Chart(df).properties(
    width=700,
    height=300,
    title={
      "text": "Fertility rate tends to be higher in rural areas",
      "subtitle": ["Fertility rate per 1000 women for each county"],
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
      }
)

color_scale = alt.Scale(domain=['Midwest', 'Northeast', 'South','West', 'nan'],
                        range=['#98df8a', '#ff7f00','#ffed6f','#8624F5','#9d755d'])

hists = base.mark_bar(opacity=0.6, thickness=100).encode(
    x=alt.X('fert_13_17',
            bin=alt.Bin(maxbins=140), # step keeps bin size the same
            scale=alt.Scale(domain=[0,200]),
            axis=alt.Axis(title='Fertility_rate per 1000 women')),
    y=alt.Y('count()',
            stack=None,
            scale=alt.Scale(domain=[0,100])),
    color=alt.Color('region:N',scale=color_scale)
).configure_title(
    fontSize=24,
    color="blue",
    fontWeight="bold",
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=12,
    titleFontWeight="bold",
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    labelFontSize=10,
    grid=False,
    gridDash=[2,3],
    titleFontWeight="bold"
).configure_axisY(
    labelFontSize=11,
    grid=True,
    gridDash=[2,3],
    titleFontWeight="bold"
)

hists
Out[193]:
The histogram above shows the distribution of fertility rate per 1000 women for five different regions. Although absolute number of records are different, there are not big difference between each region in terms of distribution of fertility rate. For most of regions fertility rate is normally distributed around the peak of about 50 per 1000 women.
In [0]:
county = gpd.read_file("UScounties.shp")

county = county.rename(columns={"FIPS":"geoid"})
county["geoid"]= county["geoid"].astype(int)
county = county[["geoid", "geometry"]]

gdf_fert = county.merge(df, on="geoid")

choro_json = json.loads(gdf_fert.to_json())
choro_data = alt.Data(values=choro_json['features'])
In [0]:
def evaluateForBivariate(row, x_bl, x_bh, y_bl, y_bh):
    colorMatrix = [["#fd8d3c", "#843c39", "#393b79"],
                   ["#f1e2cc", "#cccccc", "#4c78a8"],
                   ["#ffffcc", "#c6bdef", "#3182bd"],]
  
    xBoundaries = [x_bl, x_bh]
    xIdx = 0
    if row["total_pop"] < xBoundaries[0]:
        xIdx = 0
    elif row["total_pop"] < xBoundaries[1]:
        xIdx = 1
    else:
        xIdx = 2
  
    yBoundaries = [y_bl, y_bh]
    yIdx = 0
    if row["fert_13_17"] < yBoundaries[0]:
        yIdx = 2
    elif row["fert_13_17"] < yBoundaries[1]:
        yIdx = 1
    else:
        yIdx = 0
    return colorMatrix[yIdx][xIdx]; 
In [0]:
pop_33 = np.percentile(df["total_pop"], 33.3)
pop_66 = np.percentile(df["total_pop"], 66.7)

fert_33 = np.percentile(df["fert_13_17"], 33.3)

fert_66 = np.percentile(df["fert_13_17"], 66.7)

color = []

for i in range(0, len(df)):
    row = df.iloc[i,:]
    color.append(evaluateForBivariate(row, pop_33, pop_66, fert_33, fert_66))
  
df["color"] = np.array(color)

county = gpd.read_file("UScounties.shp")

county = county.rename(columns={"FIPS":"geoid"})
county["geoid"]= county["geoid"].astype(int)
county = county[["geoid", "geometry"]]

gdf_fert = county.merge(df, on="geoid")

choro_json = json.loads(gdf_fert.to_json())
choro_data = alt.Data(values=choro_json['features'])
In [180]:
color_range = ["#fd8d3c", "#843c39", "#393b79","#f1e2cc", "#cccccc", "#4c78a8", "#ffffcc", "#c6bdef", "#3182bd"]

color_domain = ["#fd8d3c", "#843c39", "#393b79","#f1e2cc", "#cccccc", "#4c78a8", "#ffffcc", "#c6bdef", "#3182bd"]

alt.themes.enable('latimes')

base = alt.Chart(choro_data).mark_geoshape(
        stroke='black',
        strokeWidth=1
    ).encode(
    ).project(
    type='albersUsa'
    ).properties(
        title={
      "text": "Fertility rate tends to be higher in rural areas",
      "subtitle": ["Fertility rate per 1000 women for each county"],
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
    },width=700,
    height=500
    )

    # Add Choropleth Layer
choro = alt.Chart(choro_data).mark_geoshape().encode(
        color = alt.Color("properties.color:N",
                          scale = alt.Scale(domain=color_domain, range=color_range),
                          legend=None,
                  title = "Fertility rate per 1000 women"),
         tooltip=['properties.county:O','properties.fert_13_17:Q']
    ).project(
    type='albersUsa'
    )

config = base+choro

# Legend
x = np.array(['<33 perct.', '<67 perct.', '>67 perct.', '<33 perct.', '<67 perct.', '>67 perct.', '<33 perct.', '<67 perct.', '>67 perct.'])
y = np.array(['>67 perct.', '>67 perct.', '>67 perct.', '<67 perct.', '<67 perct.', '<67 perct.', '<33 perct.', '<33 perct.', '<33 perct.'])

z = np.array([["#fd8d3c", "#843c39", "#393b79"],
     ["#f1e2cc", "#cccccc", "#4c78a8"],
    ["#ffffcc", "#c6bdef", "#3182bd"]], dtype=str)

source = pd.DataFrame({'x': x,
                     'y': y,
                     'z': z.ravel()})

legend = alt.Chart(source).mark_rect().encode(
    x=alt.X('x:N',
            axis=alt.Axis(title="Population",
                          labelFontSize=10,
                          labelAngle=0)),
    y=alt.Y('y:N',
            axis=alt.Axis(title="Fertility rate/1000 women",
                          labelFontSize=10,
                          labelAngle=-90)),
    color=alt.Color("z:N", scale=alt.Scale(domain=color_domain, range=color_range,),
                    legend=None)
).properties(width=200, height=150)

config = alt.hconcat(config, legend)

config = config.configure_title(
    color='blue',
    fontSize=24,
    fontWeight="bold", 
    anchor="middle"
).configure_axisX(
    labelFontSize=11
).configure_axisY(
    labelFontSize=11)

config
Out[180]:
Source: American community survey

The we examine the regionality of fertility rate more precisely. The bivariate choropleth map above shows the color map of fertility rate for each county whose color is categorized by the population level and fertility rate level. We can see that in less populated area tends to have high fertility rate, such as central part and Alaska. On the other hand, highly populated areas in both east and west coastal areas have relatively lower fertility rate.

Load data and preprocessing

In [0]:
df = df.iloc[:,2:]

df["above_med_wh"] = np.where(df["white_pct"]>= np.median(df["white_pct"]), 1, 0)

county["lon_lat"]= county.centroid
county = county[["geoid", "lon_lat"]]
county = county.rename(columns={"lon_lat":"geometry"})

gdf_pts = county.merge(df, on="geoid")

gdf_pts = gdf_pts[gdf_pts["state"] != "Alaska"]
gdf_pts = gdf_pts[gdf_pts["state"] != "Hawaii"]

gdf_ab_wh = gdf_pts[gdf_pts["above_med_wh"]==1]
gdf_bl_wh = gdf_pts[gdf_pts["above_med_wh"]==0]
gdf_ab_wh = gpd.GeoDataFrame(gdf_ab_wh)
gdf_bl_wh = gpd.GeoDataFrame(gdf_bl_wh)

gdf_ab_wh["region"] = gdf_ab_wh["region"].astype(str)
gdf_bl_wh["region"] = gdf_bl_wh["region"].astype(str)
In [0]:
fert_25 = np.percentile(df["fert_13_17"], 25)
fert_50 = np.percentile(df["fert_13_17"], 50)
fert_75 = np.percentile(df["fert_13_17"], 75)

perc_ab = []
perc_bl = []

for i in range(0, len(gdf_ab_wh)):
    row = gdf_ab_wh.iloc[i,:]
    if row["fert_13_17"] < fert_25:
      perc_ab.append("<25 percentile")
    elif row["fert_13_17"] < fert_50:
      perc_ab.append("<50 percentile")
    elif row["fert_13_17"] < fert_75:
      perc_ab.append("<75 percentile")
    else:
      perc_ab.append(">=75 percentile")

for i in range(0, len(gdf_bl_wh)):
    row = gdf_bl_wh.iloc[i,:]
    if row["fert_13_17"] < fert_25:
      perc_bl.append("<25 percentile")
    elif row["fert_13_17"] < fert_50:
      perc_bl.append("<50 percentile")
    elif row["fert_13_17"] < fert_75:
      perc_bl.append("<75 percentile")
    else:
      perc_bl.append(">=75 percentile")


gdf_ab_wh["perc"] = perc_ab
gdf_bl_wh["perc"] = perc_bl
In [196]:
contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))
fig = plt.figure(figsize=(20,15))
plt.title("Fertility rate for counties with high/low percentage of white people", fontsize=18)
st = fig.suptitle("Racial diversity might affect fertility", fontsize=24,
                  fontweight="bold", color="blue")
proj = projection=gcrs.AlbersEqualArea(central_latitude=45.7128, central_longitude=-100.0059)
ax1 = plt.subplot(211, projection=proj)
ax2 = plt.subplot(212, projection=proj)

ax1 = gplt.pointplot(gdf_ab_wh, projection=proj, scale="fert_13_17", hue='perc',
                     cmap='inferno_r',limits=(1, 20), legend=True, legend_var='hue',
                     edgecolor='white', linewidth=0.5, ax=ax1)

ax2 = gplt.pointplot(gdf_bl_wh, projection=proj, scale="fert_13_17", hue='perc',
                     cmap='inferno_r',limits=(1, 20),legend=True, legend_var="scale",
                     edgecolor='white', linewidth=0.5, ax=ax2)

gplt.polyplot(contiguous_usa,edgecolor='white', facecolor='lightgray',
     ax=ax1)
ax1.set_title("Fertility rate for county with high pct of white, 2013-2017",
              fontdict={'fontsize': 16})

gplt.polyplot(contiguous_usa,edgecolor='white', facecolor='lightgray',
     ax=ax2)
ax2.set_title("Fertility rate for county with low pct of white, 2013-2017",
              fontdict={'fontsize': 16})
fig.tight_layout()
Source: American community survey

Then we examine how the racial structure of each county affect the fertility rate. Here we examine the difference between more "diverse" county and less "diverse" county. The plot above is for counties with higher percent of white people than median of all counties, and the plot below is for counties with lower percent of white people. We can see that in both plots the counties in central area records high fertility rate, but for counties with higher diversity, it seems that the areas with higher fertility rate is a little more widely distributed.

In [199]:
a1 = alt.Chart(df).properties(
    width=300,
    height=200,
    title={
      "text": ["Unemployment rate & poverty rate"], 
      "color": "black",
      "fontSize":13
    }
).mark_rect().encode(
    alt.X('Unemployment_rate_2018:Q', bin=alt.Bin(maxbins=60),
          axis=alt.Axis(title='Unemployment rate')),
    alt.Y('poverty_rate:Q', bin=alt.Bin(maxbins=40),
          axis=alt.Axis(title='Poverty rate')),
    alt.Color("fert_13_17:Q", scale=alt.Scale(domain =[0, 150], scheme='inferno'),
              legend = alt.Legend(title="Fertility rate/1000 women"))
)

a2 = alt.Chart(df).properties(
    width=300,
    height=200,
    title={
      "text": ["Non-high school degree rate & poverty rate"], 
      "color": "black",
      "fontSize":13
    }
).mark_rect().encode(
    alt.X('pct_bl_hs:Q', bin=alt.Bin(maxbins=60),
          axis=alt.Axis(title='Non high school rate')),
    alt.Y('poverty_rate:Q', bin=alt.Bin(maxbins=40),
          axis=alt.Axis(title='Poverty rate')),
    alt.Color("fert_13_17:Q", scale=alt.Scale(domain =[0, 150], scheme='inferno'))
)

a3 = alt.Chart(df).properties(
    width=300,
    height=200,
    title={
      "text": ["Bachelor or higher rate & poverty rate"], 
      "color": "black",
      "fontSize":13
    }
).mark_rect().encode(
    alt.X('pct_bach_high:Q', bin=alt.Bin(maxbins=60),
          axis=alt.Axis(title='Bachelor or higher degree rate')),
    alt.Y('poverty_rate:Q', bin=alt.Bin(maxbins=40),
          axis=alt.Axis(title='Poverty rate')),
    alt.Color("fert_13_17:Q", scale=alt.Scale(domain =[0, 150], scheme='inferno'))
)

a4 = alt.Chart(df).properties(
    width=300,
    height=200,
    title={
      "text": ["Crime rate & poverty rate"], 
      "color": "black",
      "fontSize":13
    }
).mark_rect().encode(
    alt.X('crime_rate:Q', bin=alt.Bin(maxbins=60),
          axis=alt.Axis(title='Crime rate per person')),
    alt.Y('poverty_rate:Q', bin=alt.Bin(maxbins=40),
          axis=alt.Axis(title='Poverty rate')),
    alt.Color("fert_13_17:Q", scale=alt.Scale(domain =[0, 200], scheme='inferno'))
)

chart1 = a1|a2

chart2 = a3|a4

chart = alt.VConcatChart(
    vconcat=[chart1, chart2],
    title={
      "text": "Fertility rate would be correlated with some socioeconomic factors",
      "subtitle": ["The correlation between fertility rate and four socioeconomic factors for county level"],
      "subtitleColor": "black",
      "subtitleFontSize":18,
      "subtitleFontWeight":"bold"
      }
)

chart.configure_title(
    fontSize=24,
    color="blue",
    fontWeight="bold",
    anchor="middle"
).configure_legend(
    titleColor='black',
    titleFontSize=11,
    titleFontWeight="bold",
    orient='right',
    fillColor="#ebf5ff",
    padding=10
).configure_axisX(
    labelFontSize=11
).configure_axisY(
    labelFontSize=11)
Out[199]:
Source: American community survey

Now, we will examine what kind of social economic factors affect fertility rate. Although variety of factors would affect the fertility rate, here we investigate the effect of employment status, education attainment and crime rate. The four plots above show the effect of each variables via poverty rate, since those variables are considered to be highly correlated with poverty rate. For most of vfactors the effect seems subtle, but for the plot of bachelor or higher degree rate, it seems that for counties with lower rate of bachelor or higher degree, the fertility rate seems relatively higher.